(headers <- colnames(x1)) # variable names
## [1] "libertarian" "govtrust" "polcynicism"
## [4] "postmat" "authoritarian" "tolerance"
## [7] "freespeech" "reactance" "internationalism"
## [10] "interventionism" "mediatrust" "fatalism"
## [13] "multiculture" "sci_reli" "conspiracy"
## [16] "collectivism" "individualism"
# asymmetry
for( i in 1:length(headers)){
hist(x1[,i] ,breaks = 20, main = headers[i])
abline(v=median(x1[,i]), col="red")
# cat ("Press [enter] to continue or [no|No] key to break")
# line <- readline()
# if (line == 'no' | line == 'No'){
# break
# }
}
par(mar = c(3, 8, 4, 2) + 0.1)
boxplot(data1, horizontal = T, las = 1, main = "before scaling")
boxplot(x1, horizontal = T, las = 1, main = "after scaling, mean = 1, sd = 1")
#boxplot(as.matrix(x1), horizontal = T, las = 1, main = "after scaling, mean = 1, sd = 1")
par(mar = c(5, 4, 2, 3) + 0.1)
#correlation among variables
cov <- cor(data1)
cov[ row(cov) < col(cov)] <- 0; which(abs(cov) > 0.5, arr.ind = T)
## row col
## libertarian 1 1
## govtrust 2 2
## mediatrust 11 2
## multiculture 13 2
## polcynicism 3 3
## postmat 4 4
## authoritarian 5 5
## tolerance 6 6
## freespeech 7 7
## reactance 8 8
## internationalism 9 9
## interventionism 10 10
## mediatrust 11 11
## multiculture 13 11
## fatalism 12 12
## multiculture 13 13
## sci_reli 14 14
## conspiracy 15 15
## collectivism 16 16
## individualism 17 17
balloon.plot(cov, rowlabel = headers, collabel = headers)
## k - means how many clusters?
k = 1:20
wss <- rep(0,20)
xm <- colMeans(x1)
wss[1] = sum(x1^2)- nrow(x1)*sum(xm^2)
for(i in 2:20){
model <- kmeans(x1, centers = i, nstart =20, iter.max = 30)
wss[i] <- sum(model$withinss)
}
plot(wss, type ="b"); abline(v=6, col ="red", lty =2)
k = 9
set.seed(100)
km1 <- kmeans(x1, k, nstart = 100, iter.max = 30)
ord <- order(km1$centers[,1], decreasing = T)
km1 <- kmeans(x1, centers = km1$centers[ord,], nstart = 1, iter.max = 30)
km1$size
## [1] 289 291 185 267 341 257 297 355 279
Z <-matrix(0, nNodes, k )
for ( i in 1:k){ Z[km1$cluster ==i, i ] <- 1}
NZ <- Z %*% Diagonal(k, km1$size^(-1))
Dmat <- as.matrix(dist(x = x1))
Bmat <- t(NZ) %*% Dmat %*% NZ
ggplot(melt(as.matrix(Bmat)),
aes(x=as.factor(X1), y= as.factor(X2), fill=value)) +
geom_tile()+ labs(title = "Avg.Distances of Points in/out Cluster(K-means)") + xlab("") + ylab("")
plt.centers(x1, km1$cluster)
q1 <- plt.mds(mds_fit$points, km1$cluster, xlabel = "MDS Coordinate 1",
ylabel = "MDS Coordinate 2", main_title = "clusters in low dimension (Kmeans)")
q1
stree <- hclust(dist(x1),method ="complete")
plot(stree, labels = rep("",nNodes), ylab ="Distance",
main="Complete Linkages")
m <- length(stree$height)
k = 5 # number of clusters
ht <- 0.5*(stree$height[m-k+1]+stree$height[m-k+2]);
abline(h=ht,lty=2, col ="red")
hlabs <- cutree(stree, h = ht); table(hlabs)
## hlabs
## 1 2 3 4 5
## 395 1170 17 366 613
plt.centers(x1, hlabs)
q2 <- plt.mds(mds_fit$points, hlabs, xlabel = "MDS Coordinate 1",
ylabel = "MDS Coordinate 2", main_title = "clusters in low dimension(Hierachical)")
q2
library(mclust)
## Package 'mclust' version 5.2
## Type 'citation("mclust")' for citing this R package in publications.
mclus <- Mclust(x1, G = k )
mlabs<- mclus$classification
plot(x =mds_fit$points[,1], y = mds_fit$points[,2], xlab="Coordinate 1", ylab="Coordinate 2", type = 'n', main = "Hierachical Clustering")
text(x= mds_fit$points[,1], y = mds_fit$points[,2], labels = mlabs, col = mlabs, cex=.7)
plt.centers(x1, labs = mlabs )
q3 <- plt.mds(mds_fit$points, mlabs, xlabel = "MDS Coordinate 1",
ylabel = "MDS Coordinate 2", main_title = "clusters in low dimension (GMM)")
q3
# weighted graph
distM <-as.matrix(dist(x1))
s = quantile(distM, 0.05)
S <- exp(- distM/s); diag(S) <- 0
D <- rowSums(S)
L <- Diagonal(nNodes, (1+D)^(-1/2)) %*% S%*% Diagonal(nNodes, (1+D)^(-1/2))
svd1 <- irlba::irlba(L, nv = 20)
plot(svd1$d)
k = 6
U <- svd1$u[,1:k]; rowN <- sqrt(rowSums(U*U) +1e-6);
U1 <- Diagonal(nNodes, rowN^(-1)) %*% U
km2 <- kmeans(U1, k, iter.max = 30, nstart = 30 )
plt.centers(x1, km2$cluster)
q4<- plt.mds(mds_fit$points, km2$cluster, xlabel = "MDS Coordinate 1",
ylabel = "MDS Coordinate 2", main_title = "clusters in low dimension")
q41 <- plt.mds(U1[,2:3], km2$cluster, xlabel = "SC -2 ",
ylabel = "SC-3", main_title = "clusters in low dimension(SC-weighted)")
q4
q41
distM <-as.matrix(dist(x1)); s = quantile(distM, 0.05)
S <- exp(- distM/s); diag(S) <- 0
A <- matrix(0, nNodes, nNodes);
eps = quantile(S, 0.96) # 95% sparse, 50 edges
for( i in 1:nNodes ){
#A[i, S[i,]>eps] <- 1
A[i, order(-S[i,])[1:10]]<-1
}
diag(A) <- 0
A <- 1* (A+t(A)>0)
D <- rowSums(A)
L <- Diagonal(nNodes, (D+1)^(-1/2)) %*% A %*% Diagonal(nNodes, (D+1)^(-1/2))
svd1 <- irlba(L, nv = 20)
plot(svd1$d); abline(v = 6, col ="red", lty=2)
k=6
U <- svd1$u[,1:k]; rowN <- sqrt(rowSums(U*U) +1e-6); U1 <- Diagonal(nNodes, rowN^(-1)) %*% U
km2 <- kmeans(U1, k, iter.max = 30, nstart = 30 )
plt.centers(x1, km2$cluster)
q5 <- plt.mds(mds_fit$points, km2$cluster, xlabel = "MDS Coordinate 1",
ylabel = "MDS Coordinate 2", main_title = "clusters in low dimension(SC-kernel-truncated)")
q51 <- plt.mds(U1[,2:3], km2$cluster, xlabel = "2nd Eigenvector",
y= "3rd Eigenvector", main_title = "clusters in low dimension(SC-kernel-truncated)")
q5
q51
# dbscan(x, eps, minPts = 5, weights = NULL,
# borderPoints = TRUE, search = "kdtree", bucketSize = 10,
# splitRule = "suggest", approx = 0, ...)
# eps :size of the epsilon neighborhood.
# minPts
library(dbscan)
x.pca <- princomp(x1)
db <- dbscan(x.pca$scores[,1:4], eps = 0.5)
db
## DBSCAN clustering for 2561 objects.
## Parameters: eps = 0.5, minPts = 5
## The clustering contains 35 cluster(s) and 1610 noise points.
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 1610 390 315 13 7 6 44 5 6 7 6 3 7 5 4
## 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
## 8 5 4 9 9 6 5 8 5 9 2 10 10 5 4
## 30 31 32 33 34 35
## 7 5 6 6 5 5
##
## Available fields: cluster, eps, minPts
db <- dbscan(x.pca$scores[,1:4], eps = 0.4)
db
## DBSCAN clustering for 2561 objects.
## Parameters: eps = 0.4, minPts = 5
## The clustering contains 29 cluster(s) and 2192 noise points.
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 2192 23 13 91 5 60 5 14 7 12 5 7 12 12 14
## 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
## 10 7 7 5 7 6 5 4 7 5 5 5 6 5 5
##
## Available fields: cluster, eps, minPts
db <- dbscan(x.pca$scores[,1:4], eps = 0.3)
db
## DBSCAN clustering for 2561 objects.
## Parameters: eps = 0.3, minPts = 5
## The clustering contains 6 cluster(s) and 2465 noise points.
##
## 0 1 2 3 4 5 6
## 2465 6 53 14 7 9 7
##
## Available fields: cluster, eps, minPts
db <- dbscan(x.pca$scores[,1:4], eps = 0.2)
db
## DBSCAN clustering for 2561 objects.
## Parameters: eps = 0.2, minPts = 5
## The clustering contains 3 cluster(s) and 2510 noise points.
##
## 0 1 2 3
## 2510 41 5 5
##
## Available fields: cluster, eps, minPts
db <- dbscan(x.pca$scores[,1:4], eps = 0.1)
db
## DBSCAN clustering for 2561 objects.
## Parameters: eps = 0.1, minPts = 5
## The clustering contains 1 cluster(s) and 2528 noise points.
##
## 0 1
## 2528 33
##
## Available fields: cluster, eps, minPts
plt.centers(x1, db$cluster)
q6 <- plt.mds(mds_fit$points, db$cluster, xlabel = "MDS Coordinate 1",
ylabel = "MDS Coordinate 2", main_title = "clusters in low dimension(DB-SCAN)")
q6